#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-InitialExperiments/SFARI_genes/R_Markdowns/')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis)
library(biomaRt)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load raw count expression matrix and pubmed counts by gene

datExpr = read.csv('./../../FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv')
pubmed_counts = read.csv('./../Data/pubmed_count_by_gene.csv')

Convert ensembl IDs to gene names

# Get gene names from Ensembl IDs
getinfo = c('ensembl_gene_id','external_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',  dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=datExpr$X, mart=mart)
## Cache found
datGenes = datGenes[match(datExpr$X, datGenes$ensembl_gene_id),]

rm(getinfo, mart)
datGenes = datGenes %>% mutate('meanExpr' = rowMeans(datExpr[,-1])) %>% 
           left_join(pubmed_counts, by=c('external_gene_id'='Gene')) %>% na.omit

Gene Expression

Very heavy right tail

ggplotly(datGenes %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
         scale_x_log10() + theme_minimal())

Number of Publications

Genes with the highest number of publications have names like MICE, LARGE, IMPACT, ACHE, SET, PIGS, NHS … so I think most of thir related publications are not about those actual genes…

summary(datGenes$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0     723      10 7711933
cat(paste0(sum(datGenes$Count==0), ' genes have 0 publications (~', round(100*mean(datGenes$Count==0)), '%) (that\'s a lot!)'))
## 38694 genes have 0 publications (~61%) (that's a lot!)
datGenes %>% arrange(desc(Count)) %>% distinct(external_gene_id, .keep_all=TRUE) %>% 
             head(20) %>% kable(caption='Top 20 genes by number of publications')
Top 20 genes by number of publications
ensembl_gene_id external_gene_id gene_biotype meanExpr Count
ENSG00000180176 TH protein_coding 5.4090909 7711933
ENSG00000177606 JUN protein_coding 1970.3750000 2604914
ENSG00000136999 NOV protein_coding 348.3522727 2332690
ENSG00000228491 MICE pseudogene 0.0000000 1634646
ENSG00000173599 PC protein_coding 803.6136364 1464096
ENSG00000133424 LARGE protein_coding 1544.2727273 1393404
ENSG00000164458 T protein_coding 0.3522727 1215682
ENSG00000154059 IMPACT protein_coding 292.6931818 910176
ENSG00000087085 ACHE protein_coding 234.3409091 818059
ENSG00000119335 SET protein_coding 1844.2500000 502402
ENSG00000062485 CS protein_coding 973.9204545 407959
ENSG00000087111 PIGS protein_coding 375.0795455 374551
ENSG00000115718 PROC protein_coding 3.9431818 359016
ENSG00000168453 HR protein_coding 701.6818182 256256
ENSG00000175084 DES protein_coding 31.4772727 217417
ENSG00000105976 MET protein_coding 780.5568182 186947
ENSG00000188158 NHS protein_coding 572.5681818 169222
ENSG00000204490 TNF protein_coding 0.0000000 167834
ENSG00000169083 AR protein_coding 163.5568182 166496
ENSG00000010610 CD4 protein_coding 204.3068182 164440

I’m going to remove all top genes until NHS (it looks like the TNF publications are actually about that gene) and then some other genes that have high counts and their names are known words (COPE, REST, MAX, ENG, SHE, CAT, COMP, CAMP, BAD, COIL, POLE, ACE, GRASP, CAST, …)

problematic_gene_names = c('COPE','REST','MAX','ENG','SHE','CAT','COMP','CAMP','BAD','COIL','POLE','ACE',
                           'GRASP','CAST','SON','CLOCK','ITCH','STAR','WARS','MARS','MARCO','ATM','CARS')

datGenes = datGenes %>% filter(Count<=167834 & !external_gene_id %in% problematic_gene_names)

ggplotly(datGenes %>% ggplot(aes(Count+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
         scale_x_log10() + theme_minimal())

Relation between publications and mean expression

There is a positive relation between these two variables and the correlation is quite high but only when transforming to logarithmic scale the variables

corr = cor(log10(datGenes$meanExpr+1), log10(datGenes$Count+1))

datGenes %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(color='#0099cc', alpha=0.03) +
             geom_smooth(color='gray', alpha=0.3) + scale_x_log10() + scale_y_log10() + 
             ggtitle(paste0('Cor = (' ,round(corr,4),') R^2 = (', round(corr^2,4),')')) + theme_minimal() + coord_fixed()

Differentiating protein coding genes from the rest, the correlation decreases when separating the genes, it seems like part of the reason why the correlation was so high because is because it was combining two behaviours, non-protein-coding genes with low levels of expression and low numbers of papers, and protein-coding genes, with higher levels of expression and of publications.

cat(paste0(round(100*(sum(datGenes$gene_biotype=='protein_coding' & datGenes$Count==0)/sum(datGenes$gene_biotype=='protein_coding'))),
           '% of the protein coding genes have zero publications'))
## 13% of the protein coding genes have zero publications
corr1 = cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype=='protein_coding'],
            log10(datGenes$Count+1)[datGenes$gene_biotype=='protein_coding'])
corr2 = cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype!='protein_coding'],
            log10(datGenes$Count+1)[datGenes$gene_biotype!='protein_coding'])
datGenes %>% mutate(protein_coding = gene_biotype=='protein_coding') %>% 
             ggplot(aes(meanExpr+1, Count+1, color=protein_coding)) + 
             geom_point(alpha=0.03) + geom_smooth() + 
             scale_x_log10() + scale_y_log10() + 
             ggtitle(paste0('Cor PC=', round(corr1,4), ', ', 'Cor nPC=', round(corr2,4))) +
             theme_minimal() + coord_fixed()

rm(corr1, corr2)

Focusing on protein coding genes

datGenesPC = datGenes %>% filter(gene_biotype=='protein_coding') %>% mutate(meanExpr_l10 = log10(meanExpr+1),
                                                                            Count_l10 = log10(Count+1))

corr = cor(datGenesPC$meanExpr_l10, datGenesPC$Count_l10)

datGenesPC %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(alpha=0.03, color='#0099cc') + geom_smooth(color='gray') + 
               scale_x_log10() + scale_y_log10() + 
               ggtitle(paste0('Correlation = ',round(corr,4),', R^2 = ', round(corr^2,4))) +
               theme_minimal() + coord_fixed()

Filtering genes with the lowest levels of expression

datGenesPC %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
               geom_vline(xintercept=10, color='gray') + scale_x_log10() + theme_minimal()

datGenesPC = datGenesPC %>% filter(meanExpr>=9)
corr = cor(datGenesPC$meanExpr_l10, datGenesPC$Count_l10)

datGenesPC %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(alpha=0.07, color='#0099cc') + geom_smooth(color='gray') + 
               scale_x_log10() + scale_y_log10() + 
               ggtitle(paste0('Correlation = ',round(corr,4),', R^2 = ', round(corr^2,4))) +
               theme_minimal() + coord_fixed()